
import dask.dataframe as dd
import pandas as pd
from pandas.core import api
# import ascify

# Lazily read the 2019 RAIS extract; 'radiccnpj' and 'numectps' are pinned to
# object dtype instead of letting dask infer a type for them.
rais_2019 = dd.read_csv(
    'rais/rais_2019.csv',
    dtype={'radiccnpj': 'object', 'numectps': 'object'},
)

# Lazily read the supporters list (latin1-encoded), keeping 'TÍTULO_ELEITOR'
# as a string column.
apoios = dd.read_csv(
    "data/apoiamentos_aptos_alianca_pelo_brasil_feb_21.csv",
    encoding="latin1",
    dtype={'TÍTULO_ELEITOR': str},
)

# Inner-join supporters to RAIS records on exact name equality, then
# materialise the joined result as an in-memory pandas DataFrame.
apoios = apoios.merge(
    rais_2019,
    how='inner',
    left_on="NOME_ELEITOR",
    right_on="nome",
).compute()

# ASCII-folding the names with `ascify` (defined in brasil.md) was tried and
# does not improve matching, so the calls below are left commented out.

# apoios['NOME_ELEITOR'] = apoios['NOME_ELEITOR'].apply(ascify)

# rais_2019['nome'] = rais_2019['nome'].apply(ascify)

# Keep only supporters whose name maps to exactly one CPF in the merged data.
#
# A name that appears on several rows with the same CPF is a single CPF
# matched to a single name whose contract information was recorded more than
# once during the year, so it is kept.  A name associated with two or more
# different CPFs is ambiguous and is dropped.  Rows with a missing name or CPF
# are not counted, so a name that has no CPF at all is dropped as well.
#
# Distinct CPFs are counted per name directly rather than via
# `value_counts().reset_index()`, because the label of the count column that
# call produces differs between pandas versions (0 before 2.0, 'count' from
# 2.0 on) and selecting it by a fixed label breaks on one side or the other.
cpfs_per_name = apoios.groupby('NOME_ELEITOR')['CPF'].nunique()

unambiguous_names = cpfs_per_name.index[cpfs_per_name == 1]

apoios = apoios[apoios['NOME_ELEITOR'].isin(unambiguous_names)]

apoios.to_csv('rais/rais19_apoios.csv', index=False)
